import weather data
# Load the dataset named "weather_df" into the workspace.
data(list = "weather_df")
make basic scatterplot
# Scatterplot of tmax against tmin, with points coloured by `name`.
ggplot(weather_df, aes(x = tmin, y = tmax)) +
  geom_point(aes(color = name), alpha = 0.5) +
  # Human-readable text for the title, axes, legend and caption.
  labs(
    title = "temperature scatterplot",
    x = "minimum daily temp",
    y = "maximum daily temp",
    color = "location",
    caption = "data from NOAA"
  )
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Same scatterplot, now with hand-picked x-axis ticks and a transformed,
# truncated y axis.
ggplot(data = weather_df, mapping = aes(x = tmin, y = tmax)) +
  geom_point(mapping = aes(color = name), alpha = 0.5) +
  labs(
    title = "temperature scatterplot",
    x = "minimum daily temp",
    y = "maximum daily temp",
    color = "location",
    caption = "data from NOAA"
  ) +
  # Ticks at -20, 0 and 25, each with its own label text.
  scale_x_continuous(
    breaks = c(-20, 0, 25),
    labels = c("-20C", "0", "25")
  ) +
  scale_y_continuous(
    # Zoom in to the range of interest on the y axis.
    limits = c(10, 30),
    # Square-root transformation of the y axis.
    # NOTE(review): recent ggplot2 releases prefer `transform =` over
    # `trans =` -- confirm against the installed version before switching.
    trans = "sqrt"
  )
## Warning in transformation$transform(x): NaNs produced
## Warning in scale_y_continuous(trans = "sqrt", limits = c(10, 30)): sqrt
## transformation introduced infinite values.
## Warning: Removed 843 rows containing missing values or values outside the scale range
## (`geom_point()`).
alternatively to zoom in:
# Zoom in by subsetting the rows before plotting instead of setting axis
# limits. Both bounds apply to tmax (the y variable), so this mirrors the
# c(10, 30) y-axis limits used in the previous plot; the original filtered on
# `tmin < 30`, which left tmax unbounded above.
weather_df |>
  filter(tmax > 10, tmax < 30) |>
  ggplot(aes(x = tmin, y = tmax)) +
  geom_point(aes(color = name), alpha = 0.5) +
  # Human-readable text for the axes, title, caption and legend.
  labs(
    x = "minimum daily temp",
    y = "maximum daily temp",
    title = "temperature scatterplot",
    caption = "data from NOAA",
    color = "location"
  ) +
  # Ticks at -20, 0 and 25, each with its own label text.
  scale_x_continuous(
    breaks = c(-20, 0, 25),
    labels = c("-20C", "0", "25")
  )
# Scatterplot with custom x-axis ticks and a manually chosen hue range for
# the colour scale.
ggplot(weather_df, aes(x = tmin, y = tmax)) +
  geom_point(aes(color = name), alpha = 0.5) +
  labs(
    title = "temperature scatterplot",
    x = "minimum daily temp",
    y = "maximum daily temp",
    color = "location",
    caption = "data from NOAA"
  ) +
  scale_x_continuous(
    breaks = c(-20, 0, 25),
    labels = c("-20C", "0", "25")
  ) +
  # The palette can be changed by hand like this, but it is not recommended.
  scale_color_hue(h = c(100, 300))
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).
Viridis color package for pretty plots!
# Store the base plot once so later chunks can extend it without retyping.
ggp_temperature <-
  ggplot(weather_df, aes(x = tmin, y = tmax)) +
  geom_point(aes(color = name), alpha = 0.5) +
  labs(
    title = "temperature scatterplot",
    x = "minimum daily temp",
    y = "maximum daily temp",
    color = "location",
    caption = "data from NOAA"
  ) +
  scale_x_continuous(
    breaks = c(-20, 0, 25),
    labels = c("-20C", "0", "25")
  ) +
  # viridis colour scale; `name` is categorical, so use the discrete version.
  viridis::scale_color_viridis(discrete = TRUE)
# Write the stored base plot to a PNG file.
ggsave(filename = "weather_scatterplot.png", plot = ggp_temperature)
## Saving 7 x 5 in image
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).
update my base plot
# Base plot with the black-and-white theme; the legend is moved to the bottom
# by the theme() call that follows the complete theme.
ggp_temperature + theme_bw() + theme(legend.position = "bottom")
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Base plot with the minimal theme; legend placed at the bottom.
ggp_temperature + theme_minimal() + theme(legend.position = "bottom")
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Split out two locations so each can be drawn by a different layer.
central_park_df <- filter(weather_df, name == "CentralPark_NY")
molokai_df <- filter(weather_df, name == "Molokai_HI")

# The point layer uses the plot's own data (molokai_df); the line layer is
# given central_park_df instead, so the two geoms draw different data frames.
molokai_df |>
  ggplot(aes(x = date, y = tmax, color = name)) +
  geom_point() +
  geom_line(data = central_park_df)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
patchwork: make three plots and combine them using patchwork
# Panel 1: tmax against tmin, coloured by location, legend removed.
ggp_tmax_tmin <-
  ggplot(weather_df, aes(x = tmin, y = tmax, color = name)) +
  geom_point(alpha = 0.5) +
  theme(legend.position = "none")

# Panel 2: density of prcp by location, keeping only rows with prcp > 0;
# legend removed.
ggp_prec_density <-
  ggplot(filter(weather_df, prcp > 0), aes(x = prcp, fill = name)) +
  geom_density(alpha = 0.5) +
  theme(legend.position = "none")

# Panel 3: tmax over time with a smooth trend (no confidence band); this
# panel keeps its legend, placed at the bottom.
ggp_temp_season <-
  ggplot(weather_df, aes(x = date, y = tmax, color = name)) +
  geom_point(alpha = 0.5) +
  geom_smooth(se = FALSE) +
  theme(legend.position = "bottom")

# Combine the first two panels into a single figure.
ggp_tmax_tmin + ggp_prec_density
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Combine all three saved plots into one figure: the parenthesised pair is
# joined with `+` first, then `/` adds ggp_temp_season (in patchwork, `/`
# stacks its operands vertically, so the seasonal plot sits underneath).
(ggp_tmax_tmin + ggp_prec_density) / ggp_temp_season
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 17 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).
let’s make temperature violin plots
# Violin plot of tmax for each location. By default the x-axis categories
# appear in alphabetical order.
ggplot(weather_df, aes(x = name, y = tmax, fill = name)) +
  geom_violin(alpha = 0.5)
## Warning: Removed 17 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
# To show the locations in a chosen (non-alphabetical) order on the x axis,
# set the factor levels by hand: here HI first, then NY, then WA.
weather_df |>
  mutate(
    name = fct_relevel(name, "Molokai_HI", "CentralPark_NY", "Waterhole_WA")
  ) |>
  ggplot(aes(x = name, y = tmax, fill = name)) +
  geom_violin(alpha = 0.5)
## Warning: Removed 17 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
# Order the locations by their tmax values rather than alphabetically.
# fct_reorder() summarises tmax within each level with its default function
# (the median) and sorts the levels by that summary. `.na_rm = TRUE` states
# explicitly that missing tmax values are ignored when ordering; the resulting
# order is the same as before, without the `mutate()` warning.
weather_df |>
  mutate(name = fct_reorder(name, tmax, .na_rm = TRUE)) |>
  ggplot(aes(x = name, y = tmax, fill = name)) +
  geom_violin(alpha = 0.5)
## Warning: Removed 17 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
What about data tidiness?
# Goal: a plot of bdi score over time. As read in, the data are untidy: the
# bdi score is spread across several columns and the visit is embedded in the
# column names. Pivoting to long format fixes both.
pulse_df <-
  haven::read_sas("data/public_pulse_data.sas7bdat") |>
  janitor::clean_names() |>
  pivot_longer(
    cols = bdi_score_bl:bdi_score_12m,
    names_prefix = "bdi_score_", # strip the shared prefix from visit labels
    names_to = "visit",
    values_to = "bdi"
  ) |>
  # Keep the visit levels in the order they first appear in the data.
  mutate(visit = fct_inorder(visit))
# Boxplot of bdi at each visit. Without the factor step applied when
# pulse_df was built, the visits would be ordered 01, 06, 12, bl; with it,
# bl comes first as intended.
ggplot(pulse_df, aes(x = visit, y = bdi)) +
  geom_boxplot()
## Warning: Removed 879 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
Goal: create a grid of plots with 4 columns and 2 rows. Dose is on the x axis and days until the outcome is on the y axis; day 7 is the top row and day 8 is the bottom row; the columns across the top are ears, pivot, eyes, and walk.
Facet: day of tx, outcome
So we need dataframe with pup, dose, treatment, outcome, post-natal (pn) day
COMPLICATED OK LET’S GO
make a plot for the FAS study
# Read the pups data: skip the first 3 lines of the file, treat "NA", "."
# and empty strings as missing, clean the column names, and recode sex from
# 1/2 to "male"/"female".
pups_df <-
  read_csv("data/FAS_pups.csv", skip = 3, na = c("NA", ".", "")) |>
  janitor::clean_names() |>
  mutate(sex = case_match(sex, 1 ~ "male", 2 ~ "female"))
## Rows: 313 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Litter Number
## dbl (5): Sex, PD ears, PD eyes, PD pivot, PD walk
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the litters data (same missing-value codes), clean the column names,
# and split `group` after its third character into `dose` and `tx_day`.
litters_df <-
  read_csv("data/FAS_litters.csv", na = c("NA", ".", "")) |>
  janitor::clean_names() |>
  separate(col = group, into = c("dose", "tx_day"), sep = 3)
## Rows: 49 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Group, Litter Number
## dbl (6): GD0 weight, GD18 weight, GD of Birth, Pups born alive, Pups dead @ ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Attach litter-level information to every pup, matching on litter_number;
# all rows of pups_df are kept.
fas_df <- pups_df |>
  left_join(litters_df, by = "litter_number")
# Grid of violin plots for the FAS study: dose on the x axis, pn_day on the
# y axis, one row per tx_day and one column per outcome.
fas_df |>
  # Keep only the columns from pd_ears through tx_day.
  select(pd_ears:tx_day) |>
  # One row per pup and outcome; the "pd_" prefix is stripped from the
  # outcome labels.
  pivot_longer(
    cols = pd_ears:pd_walk,
    names_to = "outcome",
    names_prefix = "pd_",
    values_to = "pn_day"
  ) |>
  # Order the outcomes by pn_day using fct_reorder()'s default summary (the
  # median). `.na_rm = TRUE` states explicitly that missing pn_day values are
  # ignored when ordering, which gives the same order without the warning.
  mutate(outcome = fct_reorder(outcome, pn_day, .na_rm = TRUE)) |>
  # Drop rows with a missing value in any remaining column before plotting.
  drop_na() |>
  ggplot(aes(x = dose, y = pn_day)) +
  geom_violin() +
  facet_grid(tx_day ~ outcome)